import re
# Load the entire contents of hamlet.txt into `text`.
# The context manager closes the file even if reading raises, and a single
# read() replaces the line-by-line string concatenation.
with open('hamlet.txt') as file:
    text = file.read()
# Uncomment to print the whole document.
# print(text)
# Count the non-overlapping occurrences of the literal " to " (the word "to"
# with one space on each side) in the text and print that count.
result1 = re.compile(' to ').findall(text)
print(len(result1))

# In a regular expression "." matches any single character except a newline
# (not just letters). Count the non-overlapping matches of an "a" followed by
# two such characters and print that count.
result2 = re.compile('a..').findall(text)
print(len(result2))

# Collect the distinct three-character runs "A/a + two lowercase letters" that
# are immediately followed by a space. Only the captured group is returned by
# findall; wrapping the list in set() drops duplicates.
# The leading " *" may match zero spaces, so a run can also be the tail of a
# longer word.
result3 = set(re.findall(' *([Aa][a-z][a-z]) ', text))
print(result3)

# Collect the distinct three-letter matches starting with "a" or "A".
# The pattern has two alternatives, each with its own capture group, so
# findall returns 2-tuples in which the group belonging to the alternative
# that did not match is the empty string.
# NOTE(review): the second alternative has no trailing space, unlike the
# first, so it can also match the first three letters of a longer
# capitalised word -- confirm whether that is intended.
result4 = re.findall(' (a[a-z][a-z]) | (A[a-z][a-z])', text)
final_result4 = set()
for pair in result4:
    # A set ignores duplicates, so no membership test is needed before adding.
    final_result4.update(pair)
# Drop the empty-string placeholder. discard() is used instead of remove()
# so that a text with no matches does not raise KeyError.
final_result4.discard('')
print(final_result4)

# Raw strings keep the backslashes in \d and \w from being treated as
# (invalid) Python string escapes.

# Every run of one or more consecutive digits.
result5 = re.findall(r'\d+', text)
# Exactly two consecutive digits. A repeat count is written with braces:
# "[2]" would be a character class matching the literal character "2".
result6 = re.findall(r'\d{2}', text)
# Runs of three to five word characters. "{3,5}" is the bounded repeat;
# "[3,5]" would be a character class matching "3", "," or "5".
result7 = re.findall(r'\w{3,5}', text)
print(result5)
print(result6)
print(result7)